This document explores two datasets: one collected in 2016 on 6 individuals and another collected in 2018 on 4 individuals. For that purpose, we first need to load the ontodive package in order to load the data.
# load the ontodive package (provides the elephant-seal datasets used below)
library(ontodive)
# load the 'data_nes' dataset shipped with the package
data("data_nes", package = "ontodive")
Let’s have a look at what’s inside data_nes$year_2016:
# overview of the 2016 list: one data.table per seal
str(data_nes$year_2016, max.level = 1, give.attr = FALSE, no.list = TRUE)
## $ ind_3449:Classes 'data.table' and 'data.frame': 384 obs. of 23 variables:
## $ ind_3450:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3456:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3457:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3460:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3463:Classes 'data.table' and 'data.frame': 213 obs. of 23 variables:
A list of 6
data.frames, one for each seal
For convenience, we aggregate all 6 individuals into one dataset.
# combine all 6 individuals into a single data.table; the list names
# (ind_XXXX) become an '.id' column.
# NOTE: the argument is 'use.names' — the original 'use.name' only worked
# through partial argument matching.
data_2016 <- rbindlist(data_nes$year_2016, use.names = TRUE, idcol = TRUE)
# display an interactive table of 100 randomly sampled rows
DT::datatable(data_2016[sample.int(.N, 100), ], options = list(scrollX = TRUE))
# per-individual summary of the raw 2016 data: number of recorded days,
# maximum dive pressure, and sea-surface temperature mean / sd
data_2016[,
  .(
    nb_days_recorded = uniqueN(as.Date(date)),
    max_depth = max(maxpress_dbars),
    sst_mean = mean(sst2_c),
    sst_sd = sd(sst2_c)
  ),
  by = .id
] %>%
  sable(
    caption = "Summary diving information relative to each 2016 individual",
    digits = 2
  )
| .id | nb_days_recorded | max_depth | sst_mean | sst_sd |
|---|---|---|---|---|
| ind_3449 | 384 | 1118.81 | 26.54 | 129.91 |
| ind_3450 | 253 | 954.81 | 130.29 | 367.69 |
| ind_3456 | 253 | 697.63 | 125.24 | 360.39 |
| ind_3457 | 253 | 572.94 | 135.56 | 374.71 |
| ind_3460 | 253 | 832.25 | 65.24 | 249.12 |
| ind_3463 | 213 | 648.81 | 212.19 | 462.88 |
Well, it seems that sst is a bit odd. Let’s have a look at its distribution.
# build the per-seal histogram of raw sst2, then render it
p_sst_raw <- ggplot(data_2016, aes(x = sst2_c, fill = .id)) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(.id ~ .) +
  theme_jjo()
p_sst_raw
Figure 1.1: Distribution of raw sst2 for the six individuals in 2016
Let’s remove any data with a sst2_c > 500.
# keep only plausible sea-surface temperatures (the outliers sit near 1000).
# NOTE(review): a logical i in data.table treats NA as FALSE, so rows with
# missing sst2_c are dropped here as well — confirm this is intended.
data_2016_filter <- data_2016[sst2_c < 500, ]
# re-plot the sst2 distribution after filtering
ggplot(data_2016_filter, aes(x = sst2_c, fill = .id)) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(.id ~ .) +
  theme_jjo()
Figure 1.2: Distribution of filtered sst2 for the six individuals in 2016
Well, that seems to be much better! In the process of filtering out odd values, we removed 116 rows this way:
# count how many rows the sst2 filter removed, per individual.
# NOTE: '>= 500' is the exact complement of the '< 500' filter above
# (the original '> 500' would miss values equal to 500); this also matches
# the 'dduration >= 3000' convention used in the 2018 section.
# Rows with NA sst2_c (also dropped by the filter) are not counted here.
data_2016[sst2_c >= 500, .(nb_row_removed = .N), by = .id] %>%
  sable(caption = "# of rows removed by 2016-individuals")
| .id | nb_row_removed |
|---|---|
| ind_3449 | 4 |
| ind_3450 | 23 |
| ind_3456 | 22 |
| ind_3457 | 24 |
| ind_3460 | 10 |
| ind_3463 | 33 |
# maximum dive pressure over time, highlighting (in black) the dives whose
# sst2 readings were flagged as outliers.
# NOTE: '>= 500' matches the complement of the '< 500' filter used above.
ggplot(data_2016,
       aes(y = -maxpress_dbars, x = as.Date(date), col = .id)) +
  geom_path(show.legend = FALSE) +
  geom_point(data = data_2016[sst2_c >= 500, ], col = "black") +
  scale_x_date(date_labels = "%m/%Y") +
  labs(y = "Pressure (dbar)", x = "Date") +
  facet_wrap(.id ~ .) +
  theme_jjo() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Figure 1.3: Where and when the sst2 outliers occurred
Well this latter plot highlights several points:
ind_3449 seems to have returned ashore twice during its track. Let’s see if we can double-check that, using a map.
# interactive map of ind_3449's track; red dots mark the locations of the
# rows removed by the sst2 filter ('>= 500' is the complement of '< 500').
leaflet() %>%
  setView(lng = -122, lat = 38, zoom = 2) %>%
  addTiles() %>%
  addPolylines(
    lat = data_2016[.id == "ind_3449", latitude_degs],
    lng = data_2016[.id == "ind_3449", longitude_degs],
    weight = 2
  ) %>%
  addCircleMarkers(
    lat = data_2016[.id == "ind_3449" & sst2_c >= 500, latitude_degs],
    lng = data_2016[.id == "ind_3449" & sst2_c >= 500, longitude_degs],
    radius = 3,
    stroke = FALSE,
    color = "red",
    fillOpacity = 1
  )
Figure 1.4: It is supposed to be the track of ind_3449… (red dots are the location of removed rows)
… these coordinates seem weird!
# keep only the id and coordinate columns, then summarise per individual
coords_2016 <- data_2016[, .(.id, longitude_degs, latitude_degs)]
coords_2016 %>%
  tbl_summary(by = .id) %>%
  modify_caption("Summary of `longitude_degree` and `latitude_degree`")
| Characteristic | ind_3449, N = 384^1^ | ind_3450, N = 253^1^ | ind_3456, N = 253^1^ | ind_3457, N = 253^1^ | ind_3460, N = 253^1^ | ind_3463, N = 213^1^ |
|---|---|---|---|---|---|---|
| longitude_degs | -119 (-132, -69) | -124 (-144, -99) | -112 (-134, -6) | -122 (-132, -97) | -122 (-144, -85) | -121 (-134, -93) |
| latitude_degs | 39 (-67, 68) | 60 (-63, 68) | 56 (-63, 68) | 44 (-63, 68) | 59 (-63, 68) | 63 (42, 72) |
^1^ Median (IQR)
# reshape the coordinates to long format, then plot their distributions,
# one row per coordinate and one column per seal
coord_long <- melt(
  data_2016[, .(Longitude = longitude_degs,
                Latitude = latitude_degs,
                .id)],
  id.vars = ".id",
  value.name = "Coordinate"
)
ggplot(coord_long, aes(x = Coordinate, fill = .id)) +
  geom_histogram(show.legend = FALSE) +
  facet_grid(variable ~ .id) +
  theme_jjo()
Figure 1.5: Distribution of coordinates per seal
There is definitely something wrong with these coordinates (five seals would have crossed the equator…), but the representation of the track can also be improved! Here are some points to explore:
longitude: a part of the data seems to have a wrong sign, resulting in these distributions that appear to be cut off. latitude: this is less certain, but maybe the same problem occurs. Let’s try to play on the coordinates’ sign to see if we can display something that makes more sense.
# try forcing coordinate signs: latitudes to the northern hemisphere,
# longitudes to the western hemisphere
leaflet() %>%
  setView(lng = -122, lat = 50, zoom = 3) %>%
  addTiles() %>%
  addPolylines(
    lat = data_2016[.id == "ind_3449", abs(latitude_degs)],
    lng = data_2016[.id == "ind_3449", -abs(longitude_degs)],
    weight = 2
  )
Figure 1.6: An attempt to display the ind_3449’s track
I’ll better ask Roxanne!
# build a long table flagging missing values for every column except the
# identifier and time-stamp columns.
# NOTE: the argument is '.SDcols' — the original '.SDcol' only worked
# through partial argument matching.
dataPlot <- melt(data_2016_filter[, .(.id, is.na(.SD)),
                                  .SDcols = -c(".id",
                                               "rec#",
                                               "date",
                                               "time")])
# row index within each (variable, .id) pair, used as the y position below
dataPlot[, id_row := seq_len(.N), by = c("variable", ".id")]
# one tile per (row, column): black = missing, white = present
ggplot(dataPlot, aes(x = variable, y = id_row, fill = value)) +
  geom_tile() +
  labs(x = "Attributes", y = "Rows") +
  scale_fill_manual(values = c("white", "black"),
                    labels = c("Real", "Missing")) +
  facet_wrap(.id ~ ., scales = "free_y") +
  theme_jjo() +
  theme(
    legend.position = "top",
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.key = element_rect(colour = "black")
  )
Figure 1.7: Check for missing value in 2016-individuals
Let’s have a look at what’s inside data_nes$year_2018:
# overview of the 2018 list: one data.table per seal
str(data_nes$year_2018, max.level = 1, give.attr = FALSE, no.list = TRUE)
## $ ind_2018070:Classes 'data.table' and 'data.frame': 22393 obs. of 42 variables:
## $ ind_2018072:Classes 'data.table' and 'data.frame': 29921 obs. of 42 variables:
## $ ind_2018074:Classes 'data.table' and 'data.frame': 38608 obs. of 42 variables:
## $ ind_2018080:Classes 'data.table' and 'data.frame': 19028 obs. of 42 variables:
A list of 4
data.frames, one for each seal
For convenience, we aggregate all 4 individuals into one dataset.
# combine all 4 individuals into a single data.table; the list names
# (ind_2018XXX) become an '.id' column.
# NOTE: the argument is 'use.names' — the original 'use.name' only worked
# through partial argument matching.
data_2018 <- rbindlist(data_nes$year_2018, use.names = TRUE, idcol = TRUE)
# display an interactive table of 100 randomly sampled rows
DT::datatable(data_2018[sample.int(.N, 100), ], options = list(scrollX = TRUE))
# per-individual summary of diving effort: days recorded, number of dives,
# mean maximum depth, dive duration, bottom time, and post-dive interval
data_2018[, .(
  nb_days_recorded = uniqueN(as.Date(date)),
  nb_dives = .N,
  maxdepth_mean = mean(maxdepth),
  dduration_mean = mean(dduration),
  botttime_mean = mean(botttime),
  # pdi contains missing values, hence na.rm = TRUE (spelled out instead of
  # the reassignable shorthand T)
  pdi_mean = mean(pdi, na.rm = TRUE)
), by = .id] %>%
  sable(caption = "Summary diving information relative to each 2018 individual",
        digits = 2)
| .id | nb_days_recorded | nb_dives | maxdepth_mean | dduration_mean | botttime_mean | pdi_mean |
|---|---|---|---|---|---|---|
| ind_2018070 | 232 | 22393 | 305.52 | 783.27 | 243.22 | 109.55 |
| ind_2018072 | 342 | 29921 | 357.86 | 876.96 | 278.02 | 104.90 |
| ind_2018074 | 371 | 38608 | 250.67 | 686.25 | 291.89 | 302.77 |
| ind_2018080 | 215 | 19028 | 296.50 | 867.69 | 339.90 | 103.51 |
Very nice dataset :)
# build a long table flagging missing values for every column except the
# identifier, date-part, and dive-type columns.
# NOTE: the argument is '.SDcols' — the original '.SDcol' only worked
# through partial argument matching.
dataPlot <- melt(data_2018[, .(.id, is.na(.SD)), .SDcols = -c(".id",
                                                              "divenumber",
                                                              "year",
                                                              "month",
                                                              "day",
                                                              "hour",
                                                              "min",
                                                              "sec",
                                                              "juldate",
                                                              "divetype",
                                                              "date")])
# row index within each (variable, .id) pair, used as the y position below
dataPlot[, id_row := seq_len(.N), by = c("variable", ".id")]
# one tile per (row, column): black = missing, white = present
ggplot(dataPlot, aes(x = variable, y = id_row, fill = value)) +
  geom_tile() +
  labs(x = "Attributes", y = "Rows") +
  scale_fill_manual(values = c("white", "black"),
                    labels = c("Real", "Missing")) +
  facet_wrap(.id ~ ., scales = "free_y") +
  theme_jjo() +
  theme(
    legend.position = "top",
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.key = element_rect(colour = "black")
  )
Figure 2.1: Check for missing value in 2018-individuals
So far so good; only a few variables seem to have missing values:
# percentage of missing values per column, excluding the identifier,
# date-part, and dive-type columns ('.SDcols', not the partially matched
# '.SDcol')
table_inter <- data_2018[, lapply(.SD, function(x) {
  round(length(x[is.na(x)]) * 100 / length(x), 1)
}), .SDcols = -c(
  ".id",
  "divenumber",
  "year",
  "month",
  "day",
  "hour",
  "min",
  "sec",
  "juldate",
  "divetype",
  "date"
)]
# TRUE for columns with no missing values; vapply pins the return type,
# unlike sapply whose output shape depends on its input
cond_inter <- vapply(table_inter, function(x) x == 0, logical(1))
# drop the complete columns so only those with missing data are displayed
table_inter[, which(cond_inter) := NULL] %>%
  sable(caption = "Percentage of missing values per columns having missing values!") %>%
  scroll_box(width = "100%")
| lightatsurf | lattenuation | euphoticdepth | thermoclinedepth | driftrate | benthicdivevertrate | cornerindex | foragingindex | verticalspeed90perc | verticalspeed95perc |
|---|---|---|---|---|---|---|---|---|---|
| 26.3 | 89 | 62.6 | 1.3 | 0.5 | 22.7 | 75.8 | 0.5 | 0.1 | 0.1 |
Ok, let’s have a look at all the data. But first, we have to remove outliers. Some of them are quite easy to spot looking at the distribution of dive duration:
# copy the data (so := does not touch data_2018) and tag it "Before",
# then plot the dive-duration distribution with the outlier threshold
before_dt <- data_2018[, .SD][, state := "Before"]
ggplot(before_dt, aes(x = dduration, fill = .id)) +
  geom_histogram(show.legend = FALSE) +
  geom_vline(xintercept = 3000, linetype = "longdash") +
  facet_grid(state ~ .id, scales = "free") +
  labs(y = "# of dives", x = "Dive duration (s)") +
  theme_jjo()
Figure 2.2: Distribution of dduration for each seal. The dashed line highlights the “subjective” threshold used to remove outliers (3000 sec)
# dive-duration distribution after removing dives longer than 3000 s.
# FIX: the x/y labels were swapped in the original (x maps dduration, so it
# must carry the "Dive duration (s)" label, matching the "Before" plot).
# The redundant chained [] after the subset is also dropped.
ggplot(data_2018[dduration < 3000][, state := "After"],
       aes(x = dduration, fill = .id)) +
  geom_histogram(show.legend = FALSE) +
  geom_vline(xintercept = 3000, linetype = "longdash") +
  facet_grid(state ~ .id, scales = "free") +
  labs(x = "Dive duration (s)", y = "# of dives") +
  theme_jjo()
Figure 2.3: Same distribution of dduration for each seal but after removing any dduration > 3000 sec. The dashed line highlights the “subjective” threshold used to remove outliers
It seems much better, so let’s remove any rows with dduration > 3000 sec.
# keep only dives shorter than 3000 s
data_2018_filter <- data_2018[dduration < 3000, ]
# report how many rows the filter removed, per individual
# (>= 3000 is the exact complement of the < 3000 filter)
data_2018[dduration >= 3000, .(nb_row_removed = .N), by = .id] %>%
  sable(caption = "# of rows removed by 2018-individuals")
| .id | nb_row_removed |
|---|---|
| ind_2018070 | 3 |
| ind_2018072 | 1 |
| ind_2018074 | 33 |
# variables to plot below: everything except identifiers, date parts, and
# the two depth variables that are overlaid on the maxdepth panel instead
names_display <- names(data_2018_filter[, -c(
  ".id", "date", "divenumber", "year", "month", "day", "hour",
  "min", "sec", "juldate", "divetype", "euphoticdepth",
  "thermoclinedepth", "day_departure"
)])
# One unnumbered subsection per remaining variable: print a markdown
# heading, then a time-series scatter plot of that variable, one panel per
# individual. Relies on knitr's results='asis' behavior — cat() emits
# markdown and print() renders each ggplot. NOTE(review): assumes
# theme_jjo() and new_scale_color() (ggnewscale) are available — confirm
# both packages are loaded earlier in the document.
for (i in names_display) {
  # emit the subsection heading ({-} marks it unnumbered in bookdown)
  cat('####', i, '{-} \n')
  if (i == "maxdepth") {
    # special case: overlay the thermocline and euphotic depths on top of
    # the maximum dive depth (all plotted as negative values, i.e. depth)
    print(
      ggplot() +
        geom_point(
          data = data_2018_filter[, .(.id,
                                      date,
                                      thermoclinedepth)],
          aes(
            x = as.Date(date),
            y = -thermoclinedepth,
            colour = "Thermocline (m)"
          ),
          alpha = .2,
          size = .5
        ) +
        geom_point(
          data = data_2018_filter[, .(.id,
                                      date,
                                      euphoticdepth)],
          aes(
            x = as.Date(date),
            y = -euphoticdepth,
            colour = "Euphotic (m)"
          ),
          alpha = .2,
          size = .5
        ) +
        # manual legend for the two overlaid depth zones
        scale_colour_manual(
          values = c("Thermocline (m)" = 'red',
                     "Euphotic (m)" = "black"),
          name = "Zone"
        ) +
        # reset the colour scale so the next layer can map .id to colour
        new_scale_color() +
        geom_point(
          data = melt(data_2018_filter[, .(.id, date, get(i))], id.vars = c(".id", "date")),
          aes(
            x = as.Date(date),
            y = -value,
            col = .id
          ),
          alpha = 1 / 10,
          size = .5,
          show.legend = FALSE
        ) +
        facet_wrap(. ~ .id, scales = "free") +
        scale_x_date(date_labels = "%m/%Y") +
        labs(x = "Date", y = "Maximum Depth (m)") +
        theme_jjo() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1),
              legend.position="bottom")
    )
    # reviewer note emitted as raw HTML below the maxdepth figure
    cat("<blockquote> Considering `ind_2018074` has slightly different values than other individuals for the thermocline depth, it would be interesting to see where the animal went. </blockquote>")
  } else {
    # default case: plot the raw values of the current variable over time
    print(
      ggplot(
        data = melt(data_2018_filter[, .(.id, date, get(i))], id.vars = c(".id", "date")),
        aes(
          x = as.Date(date),
          y = value,
          col = .id
        )
      ) +
        geom_point(
          show.legend = FALSE,
          alpha = 1 / 10,
          size = .5
        ) +
        facet_wrap(. ~ .id, scales = "free") +
        scale_x_date(date_labels = "%m/%Y") +
        labs(x = "Date", y = i) +
        theme_jjo() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))
    )
  }
  # blank lines so the next heading starts a fresh markdown block
  cat(' \n \n')
}
Considering ind_2018074 has slightly different values than other individuals for the thermocline depth, it would be interesting to see where the animal went.